# 
# 1. read in directives and regulations
# 2. clean up to one data.table
# 3. Drop irrelevant observations (Commission, French, etc.)
library(stringr)
library(data.table)

### Build one data.table from the legislation text files of a given type.
#
# Assumes the text files of directives and regulations are stored in the
# "data/legislation/<type>" folder (resolved with here::here()), where <type>
# is "directive" or "regulation". Every entry of that folder is read with
# readLines() and its fields are taken from fixed line positions:
#   line 1 title, line 2 date, line 3 year, line 4 legis_type, line 5 celex,
#   line 8 in_force, line 9 expire, line 11 euroVoc, line 12 keywords,
#   line 13 fulltext.
# Lines 6, 7 and 10 are not used. A file with fewer lines yields NA for the
# fields it is missing.
#
# Args:
#   type: name of the sub-folder of "data/legislation" to read.
# Returns:
#   A data.table with one row per file and the columns title, date (Date),
#   year (numeric), legis_type, celex, in_force (Date), expire (Date),
#   euroVoc, keywords and fulltext. An empty folder gives a zero-row table.
# Raises:
#   An error if the folder does not exist or if any file could not be read.
make_data_table <- function(type) {
  legis_dir <- here::here("data", "legislation", type)
  if (!dir.exists(legis_dir)) {
    stop("Legislation folder not found: ", legis_dir, call. = FALSE)
  }
  paths <- list.files(legis_dir, full.names = TRUE)

  # Leave one core free, but never request fewer than one worker:
  # detectCores() can return 1 or NA, and mclapply() rejects mc.cores < 1.
  # On Windows mclapply() only accepts mc.cores = 1, so stay serial there.
  n_cores <- parallel::detectCores()
  if (is.na(n_cores) || .Platform$OS.type == "windows") {
    n_workers <- 1L
  } else {
    n_workers <- max(1L, min(n_cores - 1L, length(paths)))
  }
  docs <- parallel::mclapply(paths, readLines, mc.cores = n_workers)

  # Forked workers hand errors back as "try-error" objects instead of
  # stopping; fail loudly rather than turning error messages into rows.
  failed <- vapply(docs, inherits, logical(1), what = "try-error")
  if (any(failed)) {
    stop(
      "Could not read: ", paste(basename(paths[failed]), collapse = ", "),
      call. = FALSE
    )
  }

  # Pull the same line out of every document. vapply() guarantees a
  # character vector even when there are no documents at all.
  field <- function(line_no) {
    vapply(
      docs,
      function(lines) lines[line_no],
      character(1),
      USE.NAMES = FALSE
    )
  }

  data.table::data.table(
    title = field(1L),
    date = as.Date(field(2L)),
    year = as.numeric(field(3L)),
    legis_type = field(4L),
    celex = field(5L),
    in_force = as.Date(field(8L)),
    expire = as.Date(field(9L)),
    euroVoc = field(11L),
    keywords = field(12L),
    fulltext = field(13L)
  )
}

# Read both legislation types, stack them into a single table and sort the
# rows by year, then by date within a year.
directives <- make_data_table("directive")
regulations <- make_data_table("regulation")

legis <- rbind(directives, regulations)[order(year, date)]

### Remove rows that should not enter the analysis. The filters run one
# after another, so each mask is computed on the rows the previous step left.

# Commission regulations
not_commission_reg <- !grepl("Commission Regulation", legis[["title"]])
legis <- legis[not_commission_reg]

# Commission directives
not_commission_dir <- !grepl("Commission Directive", legis[["title"]])
legis <- legis[not_commission_dir]

# Identical full texts: only the first occurrence is kept
not_duplicated <- !duplicated(legis[["fulltext"]])
legis <- legis[not_duplicated]

# French-language entries, recognised by "Règlement" in the title
not_french <- !grepl("Règlement", legis[["title"]])
legis <- legis[not_french]

### Normalise the full text.
# tolower(), str_split_i() and str_squish() are already vectorised, so they
# are applied to the whole column at once. Wrapping them in sapply() called
# them once per document, attached every text as the name of its own result,
# and returned a list() instead of a character vector for an empty table.
legis[, fulltext := tolower(fulltext)]

# Cut each text at its closing formula: keep only what precedes the first
# "done at brussels" or "done at luxembourg", whichever comes first.
legis[, fulltext := stringr::str_split_i(
  fulltext, "done at (brussels|luxembourg)", i = 1
)]

# Trim both ends and collapse internal runs of whitespace to single spaces.
legis[, fulltext := stringr::str_squish(fulltext)]

# The path is relative, so the file lands in the current working directory.
save(legis, file = "legisdata.RData")
